In [1]:
# Libraries for parsing data
import os
import pandas as pd
import xml.etree.ElementTree as ET
from lxml import etree
from bs4 import BeautifulSoup
import re
import numpy as np
In [2]:
path_dropbox = "D:/Dropbox/Research/China Foreign Share Discount"

Call-Participant data¶

In [3]:
calls = pd.read_pickle(path_dropbox + '/Conference Call Transcript/transcript.pkl')
calls['ctry'] = calls['ISIN'].str[:2]
calls = calls[~calls['ctry'].isin([None])] #delete missing only
In [4]:
institutions_column = []
for participants in calls['participants']:
    institutions = []
    for participant in participants:
        person_institution = participant.split('\n')
        if len(person_institution) == 2:
            institution_position = person_institution[1].upper()
            institution = institution_position.split('-')[0]
            if len(institution_position.split('-')) == 2:
                position = institution_position.split('-')[1]
                if 'CEO' in position:
                    continue
                if 'CFO' in position:
                    continue
            institution = institution.replace('[', '')
            institution = institution.replace(']', '')
            institution = institution.replace('&', '&')
            institution = institution.strip()
            institutions.append(institution)
    institutions_column.append(institutions)
In [5]:
calls['institutions'] = institutions_column
In [6]:
calls_explode = calls.explode('institutions')
In [7]:
ctry_code = pd.read_excel(path_dropbox + '\Conference Call Transcript\country code.xlsx')
In [8]:
calls_explode = calls_explode.merge(ctry_code, left_on = 'ctry', right_on = 'Alpha-2 code')

Bring in participant-country dataset¶

In [9]:
institution_ctry_nonus = pd.read_excel(path_dropbox + '\Conference Call Transcript\institution_nonus.xlsx', sheet_name='nonus')
In [10]:
institution_ctry_remain = pd.read_excel(path_dropbox + '\Conference Call Transcript\institution headquarters_remain2.xlsx', sheet_name='Institution')
In [11]:
institution_ctry_remain.rename(columns={"Country 1": "Country1", "Country 2": "Country2"}, inplace=True)
In [12]:
institution_ctry = pd.concat([institution_ctry_nonus, institution_ctry_remain])
In [13]:
calls_explode = calls_explode.merge(institution_ctry, left_on = 'institutions', right_on = 'Institution')
In [14]:
calls_final = calls_explode[~calls_explode['Country1'].isna()]
In [15]:
calls_final['Date'] = pd.to_datetime(calls_final['Date'])
calls_final['year'] = calls_final['Date'].dt.year
calls_final['quarter'] = calls_final['Date'].dt.quarter
C:\Users\yifeilu\AppData\Local\Temp\ipykernel_27468\1114554688.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  calls_final['Date'] = pd.to_datetime(calls_final['Date'])
C:\Users\yifeilu\AppData\Local\Temp\ipykernel_27468\1114554688.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  calls_final['year'] = calls_final['Date'].dt.year
C:\Users\yifeilu\AppData\Local\Temp\ipykernel_27468\1114554688.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  calls_final['quarter'] = calls_final['Date'].dt.quarter
In [16]:
calls_final['foreign'] = (calls_final['Country1'] != calls_final['Alpha-3 code']) | ((calls_final['Country2'].notna()) & (calls_final['Country2'] != calls_final['Alpha-3 code']))
C:\Users\yifeilu\AppData\Local\Temp\ipykernel_27468\1998467780.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  calls_final['foreign'] = (calls_final['Country1'] != calls_final['Alpha-3 code']) | ((calls_final['Country2'].notna()) & (calls_final['Country2'] != calls_final['Alpha-3 code']))
In [17]:
calls_final = calls_final.rename(columns={"Alpha-3 code": "firm_ctry",
                                         "Country1": "part_ctry1",
                                         "Country2": "part_ctry2"})
In [18]:
calls_final.drop(['Error code', 'Error Description', 'Date'], axis=1, inplace=True)
calls_final['participants'] = calls_final['participants'].astype(str).replace('nan', np.nan)
calls_final['part_ctry2'] = calls_final['part_ctry2'].astype(str).replace('nan', np.nan)
In [19]:
calls_final.to_stata(path_dropbox + "\Conference Call Transcript\calls_part_full_0227.dta", version=118, write_index = False)
C:\Users\yifeilu\AppData\Local\Temp\ipykernel_27468\3662744895.py:1: InvalidColumnName: 
Not all pandas column names were valid Stata variable names.
The following replacements have been made:

    Alpha-2 code   ->   Alpha_2_code

If this is not what you expect, please make sure you have Stata-compliant
column names in your DataFrame (strings only, max 32 characters, only
alphanumerics and underscores, no Stata reserved words)

  calls_final.to_stata(path_dropbox + "\Conference Call Transcript\calls_part_full_0227.dta", version=118, write_index = False)